library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer) ; library(ggpubr)
library(biomaRt)
library(polycor)
library(caret) ; library(ROCR) ; library(car) ; library(MLmetrics)
library(corrplot)
library(expss) ; library(knitr) ; library(kableExtra)
library(foreach) ; library(doParallel)
# Return the SFARI gene-score colour palette (hex codes), indexed by position.
#
# r: integer vector of palette positions (1-10); may select several colours at once.
# Returns a character vector of hex colour codes the same length as r.
#
# Fix: the original body ended in an assignment (`pal = ...`), so the value was
# returned invisibly — harmless when used as an argument, but surprising when
# called interactively. Return the indexed vector explicitly instead.
SFARI_colour_hue <- function(r) {
  pal <- c('#FF7631', '#FFB100', '#E8E328', '#8CC83F', '#62CCA6',
           '#59B9C9', '#b3b3b3', '#808080', 'gray', '#d9d9d9')
  pal[r]
}
# Module assignments: pick one clustering algorithm's labels to work with
clustering_selected = 'DynamicHybrid'
clusterings = read_csv('./../Data/clusters.csv')
# Flatten the selected clustering's column into an atomic vector named 'Module'
clusterings$Module = clusterings[,clustering_selected] %>% data.frame %>% unlist %>% unname
assigned_module = clusterings %>% dplyr::select(ID, Module)
# Dataset created with 20_04_07_create_dataset.html
dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names=1)
# mutate() drops rownames, so save them here and restore after the transformation
rownames_dataset = rownames(dataset)
dataset = dataset %>% mutate(Module = clusterings$Module, gene.score = as.character(gene.score)) %>%
# Genes labelled 'Others' are not in SFARI: relabel as 'None'; drop the raw clustering column
mutate(gene.score = ifelse(gene.score=='Others', 'None', gene.score)) %>%
dplyr::select(-matches(clustering_selected))
rownames(dataset) = rownames_dataset
# Fix any Gene Significance that is NA by recomputing it from the expression data:
# GS is the polyserial correlation between a gene's expression and Diagnosis
GS_missing = rownames(dataset)[is.na(dataset$GS)]
if(length(GS_missing)>0){
# Gandal dataset (provides datExpr and datMeta)
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
for(g in GS_missing){
dataset$GS[rownames(dataset) == g] = polyserial(as.numeric(datExpr[g,]), datMeta$Diagnosis)
}
}
# SFARI Genes: keep one row per Ensembl ID, dropping duplicates and missing IDs
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# Add gene symbol via biomaRt (ensembl_gene_id -> external_gene_id)
getinfo = c('ensembl_gene_id','external_gene_id')
# Feb 2014 Ensembl archive matches the Gencode v19 annotation used upstream
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(dataset), mart=mart)
# NOTE(review): if GS_missing was empty above, 'g' was never created and rm(g) emits a warning
rm(getinfo, mart, rownames_dataset, GO_annotations, g, GS_missing)
The features that will be considered for the classification model will be the ones WGCNA uses to identify significant modules and genes:
Correlation of a gene’s expression pattern to diagnosis: Using Gene Significance
Correlation of a gene’s module to Diagnosis: Using Module-Diagnosis correlation
Module assignment: Here, instead of just indicating the cluster the gene belongs to in a binary way, we decided to use the Module Membership, which measures the similarity of each gene to the module they belong, and we decided to also add the Module Membership to all of the other modules as well, since this could give us much more information than the original binary assignment (using eigengenes)
Filtering the 1830 genes that were not assigned to any cluster (represented as the gray cluster)
# Genes without a Module-Trait correlation belong to the unassigned (gray) cluster
rm_cluster = dataset[is.na(dataset$MTcor),'Module'] %>% unique %>% as.character
# Keep only clustered genes; drop p-value columns, the Module label and the gray
# membership; add |GS| and the binary SFARI response used as the model's label
new_dataset = dataset %>% filter(Module != 'gray' & !is.na(MTcor)) %>%
dplyr::select(-c(matches(paste('pval|Module')), MMgray)) %>%
mutate('absGS'=abs(GS), 'SFARI'=gene.score!='None') %>% dplyr::select(-gene.score)
rownames(new_dataset) = rownames(dataset)[dataset$Module != 'gray']
# NOTE(review): rm_cluster is computed but never used before being removed
rm(rm_cluster)
Using Module Membership variables instead of binary module membership
Including a new variable with the absolute value of GS
Removing genes assigned to the gray module (unclassified genes)
Adding the Objective variable: Binary label indicating if it’s in the SFARI dataset or not
# Preview: transpose the first five rows of the modelling dataset so that the
# features run down the rows and the genes across the columns
table_info <- t(head(new_dataset, 5))
kable(table_info,
      caption = '(Transposed) features and their values for the first rows of dataset',
      col.names = colnames(table_info)) %>%
  kable_styling(full_width = F)
| ENSG00000174840 | ENSG00000121410 | ENSG00000078328 | ENSG00000175899 | ENSG00000166535 | |
|---|---|---|---|---|---|
| MTcor | 0.0832482 | -0.2137853 | -0.9145290 | -0.3034897 | 0.2190318 |
| GS | 0.0946359 | -0.2088418 | -0.4470894 | -0.2260618 | 0.1485608 |
| MM.DE8C00 | -0.0813327 | -0.0562470 | -0.0427428 | 0.2965456 | 0.0362411 |
| MM.6FB000 | -0.4548321 | -0.1166111 | -0.3156040 | -0.3129274 | -0.0840447 |
| MM.00C086 | -0.2493116 | -0.2158016 | -0.1905838 | -0.4318493 | -0.0385317 |
| MM.F57963 | -0.4332353 | -0.0098177 | -0.0655534 | -0.0063333 | -0.1432524 |
| MM.DA8E00 | 0.0768426 | -0.0539677 | -0.0741071 | -0.1747240 | -0.2641453 |
| MM.00BDD4 | 0.2332492 | 0.0207811 | 0.0180752 | -0.2393298 | -0.0028267 |
| MM.00BF74 | 0.1102612 | -0.0487173 | -0.1954640 | -0.2362997 | -0.0288798 |
| MM.00A7FF | -0.0627492 | -0.0140640 | -0.1196216 | -0.2903817 | -0.2081182 |
| MM.5A9DFF | -0.1005093 | 0.0912025 | 0.0866405 | -0.2145973 | -0.0720524 |
| MM.00C08E | 0.1686922 | -0.0506604 | -0.2431874 | -0.1955273 | 0.0081254 |
| MM.FF64B1 | 0.2028247 | -0.0198433 | -0.0208405 | -0.2608037 | 0.1219179 |
| MM.D69100 | -0.1755946 | 0.1545721 | -0.0156458 | 0.0452504 | -0.2916027 |
| MM.BC81FF | 0.1803425 | 0.0430306 | -0.0049419 | 0.2805490 | -0.1306227 |
| MM.00AAFD | 0.0359790 | -0.0001570 | -0.4124447 | 0.1065160 | 0.0361258 |
| MM.9CA700 | 0.0459371 | -0.1639700 | -0.4440864 | -0.0686088 | -0.0172798 |
| MM.00BB4B | 0.3436858 | -0.0195388 | -0.3165312 | 0.0396536 | 0.0836749 |
| MM.A789FF | 0.1747797 | 0.0790658 | -0.2700054 | 0.1450093 | 0.0306923 |
| MM.8E92FF | 0.0610536 | -0.1444906 | -0.2406208 | -0.1504703 | -0.0595903 |
| MM.00BECE | -0.1867569 | -0.2473776 | -0.4344758 | -0.4140629 | -0.1093526 |
| MM.79AF00 | -0.0068428 | -0.1944098 | -0.5512190 | -0.2593433 | -0.0956816 |
| MM.00BF7D | -0.0250652 | -0.2851167 | -0.5991594 | -0.1310486 | 0.1450658 |
| MM.2CB600 | -0.1626872 | -0.3090055 | -0.5227051 | -0.2438819 | 0.0868732 |
| MM.8CAB00 | 0.0066877 | -0.1345929 | -0.5078961 | -0.3926168 | -0.0442148 |
| MM.E68710 | 0.0442406 | -0.2147079 | -0.5732910 | -0.3120124 | -0.1342483 |
| MM.00BCDA | -0.1286629 | -0.1446787 | -0.4015763 | -0.3459154 | 0.1543522 |
| MM.7F96FF | -0.1524808 | -0.0911292 | -0.3353314 | -0.5333744 | -0.0181422 |
| MM.E28900 | -0.2106204 | 0.1352302 | 0.1082066 | 0.1389685 | -0.1830438 |
| MM.00A4FF | 0.0573277 | 0.1818729 | -0.1196505 | 0.0848474 | 0.1758487 |
| MM.D575FE | -0.3219647 | 0.1767890 | -0.1421686 | 0.1295538 | 0.0016537 |
| MM.FF6C91 | -0.4037728 | 0.3468478 | 0.3071323 | 0.4035206 | 0.0485337 |
| MM.00C19E | -0.1522857 | -0.0555546 | -0.1482107 | -0.2722913 | -0.0711533 |
| MM.C79800 | 0.0309320 | -0.0698142 | 0.1716357 | -0.2161322 | -0.0679629 |
| MM.00B5EE | 0.0560484 | -0.0355969 | -0.2069813 | -0.3117382 | 0.0714419 |
| MM.6E99FF | -0.1884645 | -0.0846878 | -0.3388231 | -0.2795725 | 0.1314537 |
| MM.00C0B4 | -0.1010749 | -0.2027009 | -0.0762083 | -0.3378813 | -0.0301776 |
| MM.BC9D00 | -0.2162542 | 0.0387011 | 0.1497072 | -0.3360844 | -0.0780279 |
| MM.CD79FF | -0.1836296 | 0.0684606 | 0.1001112 | -0.3119923 | -0.0364334 |
| MM.F07E4B | -0.0918536 | -0.0007418 | 0.2622670 | -0.0906417 | 0.1928493 |
| MM.FD61D3 | -0.1648623 | 0.1058354 | 0.4004618 | -0.2103919 | 0.0696020 |
| MM.55B300 | -0.0992350 | -0.0748253 | 0.1283496 | -0.4167064 | -0.0308686 |
| MM.FF63B8 | -0.2231235 | -0.0097780 | 0.2169574 | -0.3204277 | -0.1182891 |
| MM.F8766D | 0.0018065 | 0.1787646 | 0.0972655 | 0.1169564 | 0.0469163 |
| MM.ED813E | 0.1747014 | 0.1500265 | 0.3751523 | -0.0292345 | 0.1926426 |
| MM.F863DF | 0.0578081 | 0.2341973 | 0.3536940 | -0.0976476 | 0.0539418 |
| MM.00B7E9 | 0.1485387 | -0.0559437 | -0.0679460 | -0.2627636 | 0.1747425 |
| MM.94A900 | 0.1403746 | -0.0928804 | 0.0697380 | -0.3723450 | 0.0959963 |
| MM.00ADFA | 0.1639550 | 0.1062030 | 0.5221455 | 0.0800016 | -0.1757568 |
| MM.83AD00 | 0.0873033 | 0.2833470 | 0.3423678 | 0.1297448 | -0.0954132 |
| MM.44B500 | 0.3713062 | -0.0869081 | 0.2244425 | 0.0569648 | 0.0842512 |
| MM.DB72FB | 0.4066223 | 0.0136957 | 0.1942735 | -0.0255669 | -0.0923165 |
| MM.E9842C | 0.4726265 | -0.0762786 | 0.0989248 | -0.1288009 | 0.0674142 |
| MM.00BD61 | 0.5210802 | 0.0107588 | 0.0010740 | 0.2004524 | 0.0926400 |
| MM.C29B00 | 0.4831548 | 0.1114200 | 0.3375801 | 0.1484716 | 0.1278595 |
| MM.00BA3D | 0.1682624 | 0.1315209 | 0.1374211 | 0.0476274 | -0.1058637 |
| MM.FF62BF | -0.0557176 | 0.1265554 | 0.0988161 | 0.0112068 | -0.0510165 |
| MM.00BE6B | 0.0631851 | -0.0834246 | -0.2719844 | 0.3030880 | 0.0016276 |
| MM.D19300 | 0.1417563 | -0.0884539 | -0.3787281 | 0.2912178 | -0.0484254 |
| MM.00B92D | 0.3595897 | -0.0279321 | -0.3241466 | -0.0835873 | -0.0642214 |
| MM.AAA300 | 0.4640766 | 0.0445820 | 0.0295084 | 0.2533439 | -0.0879892 |
| MM.00BADF | 0.2055705 | 0.1287859 | 0.2109354 | -0.0232069 | -0.1623422 |
| MM.00C0BA | 0.1124727 | -0.1240675 | 0.1433735 | 0.1223364 | -0.1070138 |
| MM.00C1AD | 0.0754203 | -0.0627419 | -0.3283389 | 0.2385344 | 0.1154950 |
| MM.B0A100 | -0.0508591 | 0.1454518 | 0.0394742 | 0.3385572 | -0.0889242 |
| MM.FF61C6 | -0.1501804 | -0.0803551 | -0.1479026 | 0.0553914 | 0.0111525 |
| MM.3EA1FF | 0.2304045 | 0.2117181 | 0.0220890 | 0.2721439 | 0.1484570 |
| MM.FA7477 | 0.2540248 | 0.2713906 | 0.3562823 | 0.3399675 | 0.2209892 |
| MM.C57DFF | -0.0004470 | -0.0423414 | 0.0046082 | 0.4714587 | 0.0014977 |
| MM.CC9600 | 0.2016330 | 0.0309888 | 0.0323509 | 0.3465395 | 0.1563802 |
| MM.00C1A5 | 0.0184487 | -0.0328614 | 0.1292002 | -0.0627340 | 0.0678604 |
| MM.FE61CD | -0.0033463 | -0.0425762 | -0.2637903 | -0.0157298 | 0.0716888 |
| MM.F464E4 | -0.2744923 | -0.1487886 | -0.1713998 | -0.1130489 | 0.0866947 |
| MM.B69F00 | -0.0726044 | -0.1572332 | -0.0643350 | 0.1028033 | 0.0131138 |
| MM.F066EA | -0.0787652 | -0.0752724 | 0.1268331 | 0.0168737 | 0.1381884 |
| MM.FA62D9 | 0.0148696 | 0.0636646 | -0.2874563 | -0.1779839 | -0.0072724 |
| MM.00BFC1 | 0.0723657 | 0.0396408 | 0.0664050 | 0.2632726 | 0.1231384 |
| MM.FC7180 | 0.1836392 | -0.1895422 | -0.4006710 | -0.0123738 | 0.2717846 |
| MM.00B8E4 | 0.2255110 | 0.0652075 | 0.1777184 | 0.2407287 | 0.0180128 |
| MM.00B0F6 | 0.3100602 | -0.1014479 | 0.2722409 | 0.0673448 | -0.0765110 |
| MM.FF66A9 | 0.0445709 | 0.0530329 | 0.4204220 | 0.2318865 | -0.2417528 |
| MM.00BC56 | -0.0563847 | 0.2181936 | 0.4328106 | 0.1377576 | -0.2324858 |
| MM.00BFC8 | -0.0481471 | 0.0472750 | 0.3811959 | -0.1479967 | -0.1800069 |
| MM.E26EF7 | -0.0698042 | 0.2161273 | 0.5563800 | 0.3859588 | 0.0372900 |
| MM.00C096 | -0.0875981 | 0.0454643 | 0.4424028 | 0.1957238 | 0.1606864 |
| MM.00B813 | 0.0132938 | 0.1460011 | 0.6040098 | 0.1790479 | 0.0706359 |
| MM.63B200 | 0.1651734 | 0.1694572 | 0.5846569 | 0.3462267 | 0.1601311 |
| MM.A3A500 | -0.1722826 | 0.1903747 | 0.5483385 | 0.3127235 | -0.0023800 |
| MM.00B2F2 | -0.0958665 | 0.2050325 | 0.5628584 | 0.0988988 | -0.1448892 |
| MM.FE6E89 | -0.2258332 | 0.1795930 | 0.5157362 | 0.1972410 | -0.2019764 |
| MM.9B8EFF | 0.0000429 | 0.2379469 | 0.3543707 | 0.6145468 | -0.0369566 |
| MM.F37C58 | -0.3405722 | 0.2391375 | 0.3777951 | 0.3933602 | -0.0841988 |
| MM.E76BF3 | 0.2590833 | 0.0291549 | 0.3382920 | 0.2467875 | -0.2286062 |
| MM.FF68A2 | -0.2010435 | 0.1146447 | 0.3463303 | 0.1604674 | -0.0712511 |
| MM.FF6A9A | -0.2404087 | 0.0764843 | 0.1524623 | 0.1295565 | -0.1168925 |
| MM.B285FF | 0.0708652 | 0.0495138 | 0.3861183 | 0.3738267 | -0.0348559 |
| MM.EC69EE | 0.0344338 | 0.0792035 | 0.1817272 | 0.5724684 | 0.0207306 |
| absGS | 0.0946359 | 0.2088418 | 0.4470894 | 0.2260618 | 0.1485608 |
| SFARI | 0.0000000 | 0.0000000 | 1.0000000 | 0.0000000 | 0.0000000 |
rm(table_info)
# Keep the untouched dataset around for later lookups of gene.score and Module,
# then make the filtered feature table the working dataset from here on
original_dataset <- dataset
dataset <- new_dataset
rm(new_dataset)
The final dataset contains 13628 observations (genes) and 99 variables
The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and both SFARI and absGS are in the middle of both groups (just like with Gandal’s, Gupta’s and Wright’s datasets)
# PCA of the transposed dataset: the observations here are the *variables*, so the
# plot shows how the features (MM.*, GS, absGS, MTcor, SFARI) group together
pca = dataset %>% mutate(SFARI = as.numeric(SFARI)) %>% t %>% prcomp
# Tag each variable by its type for the plot tooltip
plot_data = data.frame('ID'=colnames(dataset), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2],
type=ifelse(grepl('MM', colnames(dataset)),'ModMembership',
ifelse(grepl('SFARI', colnames(dataset)), 'SFARI',
ifelse(grepl('GS', colnames(dataset)), 'GS', 'MTcor'))))
# Module-Trait correlation for each module, joined by the MM.<hex> feature name
mtcor_by_module = original_dataset %>% dplyr::select(Module, MTcor) %>% unique
colnames(mtcor_by_module) = c('ID','MTcor')
plot_data = mtcor_by_module %>% mutate(ID = gsub('#','MM.',ID)) %>% right_join(plot_data, by='ID')
# Axis labels carry the % of variance explained by each component
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(aes(id=ID)) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
scale_colour_distiller(palette = 'RdBu', na.value = 'darkgrey') + theme_minimal() +
ggtitle('PCA of variables coloured by Module-Diagnosis correlation'))
rm(mtcor_by_module, pca)
The two main patterns that seem to characterise the genes are their Gene Significance and the Module-Diagnosis correlation of their corresponding module
Mean Expression doesn’t seem to play an important role
SFARI Genes seem to be evenly distributed everywhere
It’s not clear what the 2nd principal component is capturing, perhaps a bit of mean level of expression?
# Mean Expression data (datExpr, datGenes, datMeta come from the RData file)
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
rownames(datExpr) = datGenes$ensembl_gene_id
datMeta = datMeta %>% mutate(ID = paste0('X',description))
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))
# PCA on the transposed dataset again, but this time the genes are recovered
# through pca$rotation (the loadings), since the observations were the variables
pca = dataset %>% t %>% prcomp
plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2],
'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
# SFARI genes are drawn more opaque so they stand out
mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')
# Four views of the same projection, coloured by different gene attributes
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(alpha=0.4) + scale_color_viridis() +
theme_minimal() + ggtitle('Genes coloured by Module-Diagnosis correlation') +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
theme(legend.position='bottom')
p2 = plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.4) + scale_color_viridis() +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
theme_minimal() + ggtitle('Genes coloured by Gene Significance') + theme(legend.position='bottom')
p3 = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(alpha = plot_data$alpha) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')
# Marginal densities by SFARI label on the third panel
p3 = ggExtra::ggMarginal(p3, type='density', groupColour=TRUE, size=10)
p4 = plot_data %>% ggplot(aes(PC1, PC2, color=meanExpr)) + geom_point(alpha=0.4) + scale_color_viridis() +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
theme_minimal() + ggtitle('Genes coloured by mean level of expression') + theme(legend.position='bottom')
grid.arrange(p1, p2, p3, p4, nrow=2)
rm(pca, datExpr, datGenes, datMeta, dds, DE_info, mean_expr, p1, p2, p3, p4)
5.02% of the observations are positive. This can be a problem when training the classification model, so the samples in the training set should be balanced between classes before the model is trained.
# Cross-tabulate the SFARI label to quantify the class imbalance
table_info = dataset %>% apply_labels(SFARI = 'SFARI')
cro(table_info$SFARI)
| #Total | |
|---|---|
| SFARI | |
| FALSE | 12944 |
| TRUE | 684 |
| #Total cases | 13628 |
rm(table_info)
To divide our samples into training and test sets:
Use 70% and 30% of the samples respectively (the repeated model runs further down use a 75/25 split)
Even though our model’s label is binary, we are using the original SFARI Scores to do the partition in training and test sets to maintain the original score proportions in each set
set.seed(123)
# Recover each gene's original SFARI score (1-3 or 'None') so the partition can be
# stratified by score, keeping the score proportions equal in train and test sets
sample_scores = dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID) %>%
left_join(original_dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, gene.score),
by = 'ID') %>%
mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))
# NOTE(review): p = 0.7 gives a 70/30 split here, while the repeated runs further
# down use p = 0.75 — confirm which proportion is intended
train_idx = createDataPartition(sample_scores$gene.score, p = 0.7, list = FALSE)
train_set = dataset[train_idx,]
test_set = dataset[-train_idx,]
rm(sample_scores, train_idx)
To fix this class imbalance, we are going to use SMOTE, an over-sampling technique that over-samples the minority class (SFARI Genes) by creating synthetic examples, in the training set
# Class balance in the training set (before SMOTE over-sampling)
cro(train_set$SFARI)
| #Total | |
|---|---|
| train_set$SFARI | |
| FALSE | 9061 |
| TRUE | 480 |
| #Total cases | 9541 |
This set is used just to evaluate how well the model performs, so the class imbalance is not a problem here
# Class balance in the test set (left imbalanced on purpose)
cro(test_set$SFARI)
| #Total | |
|---|---|
| test_set$SFARI | |
| FALSE | 3883 |
| TRUE | 204 |
| #Total cases | 4087 |
# https://shiring.github.io/machine_learning/2017/04/02/unbalanced
# https://topepo.github.io/caret/using-your-own-model-in-train.html#Illustration5
# caret needs a factor label with valid R names, so recode the logical to SFARI/not_SFARI
train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)
k_fold = 10
cv_repeats = 5
# Repeated 10-fold CV; SMOTE is applied inside each fold to balance the classes
smote_over_sampling = trainControl(method = 'repeatedcv', number = k_fold, repeats = cv_repeats,
verboseIter = FALSE, classProbs = TRUE, savePredictions = 'final',
summaryFunction = twoClassSummary, sampling = 'smote')
# Using ROC as metric because it doesn't depend on the threshold
fit = caret::train(SFARI ~ ., data = train_set, method = 'glm', family = 'binomial', metric = 'ROC',
trControl = smote_over_sampling)
# There is some perfect multicollinearity that doesn't let us do the vif analysis, so I'll remove those variables (you cannot use alias in caret::train, so I had to train the model again directly with glm)
ld.vars = attributes(alias(glm(SFARI~., data = train_set, family = 'binomial'))$Complete)$dimnames[[1]]
# Remove the linearly dependent variables by subtracting them from the formula
formula.new = as.formula( paste('SFARI ~ .', paste(ld.vars, collapse='-'), sep='-') )
# Retrain model without these variables
fit = caret::train(formula.new, data = train_set, method = 'glm', family = 'binomial', metric = 'ROC',
trControl = smote_over_sampling)
rm(smote_over_sampling, ld.vars, formula.new, k_fold, cv_repeats)
The model has an AUC of 0.6062
But the features are strongly correlated, which inflates the standard error of the coefficients, making them no longer interpretable, so perhaps it would be better to use another model
# VIF (Variance Inflation Factor) of each feature in the final model.
# Compute the sorted VIF vector once instead of calling car::vif() twice.
vif_vals = car::vif(fit$finalModel) %>% sort
plot_data = data.frame('Feature' = names(vif_vals),
                       'VIF' = unname(vif_vals)) %>%
  # Rule of thumb: VIF > 10 indicates problematic collinearity
  mutate(outlier = VIF>10)
# Bars on a log scale; features above the threshold are coloured as outliers
plot_data %>% ggplot(aes(reorder(Feature, -VIF), VIF, fill = !outlier)) + geom_bar(stat='identity') +
scale_y_log10() + geom_hline(yintercept = 10, color = 'gray', linetype = 'dashed') +
xlab('Model Features') + ggtitle('Variance Inflation Number for each Feature') + theme_minimal() +
theme(legend.position = 'none', axis.text.x = element_text(angle = 90, hjust = 1))
rm(plot_data, vif_vals)
Correlation plot
# Correlation plot between all model features (last column, the SFARI label, excluded)
corrplot.mixed(cor(train_set[,-ncol(train_set)]), lower = 'number', lower.col = 'gray', number.cex = .6,
tl.pos = 'l', tl.col = '#666666')
Remove all variables with a VIF>10: We would lose all but two of our variables, not ideal
Do Principal Component Regression: We would lose the relation between the prediction and the original features, which could be interesting to study
Don’t do anything: Multicollinearity affects the coefficients and p-values of the regression, but it doesn’t affect the predictions, precision of the predictions or the goodness-of-fit statistics (see reference), but as with the previous option, we cannot study the coefficients of the regression
Use Ridge Regression: The penalty it gives to high coefficients reduces the variance introduced by the correlation, making the coefficients interpretable again
Notes:
Running the model multiple times to get more accurate measurements of its performance
Over-sampling positive samples in the training set to obtain a 1:1 class ratio using SMOTE
Performing 5 repetitions of cross validation with 10-folds each
### DEFINE FUNCTIONS
# Split `dataset` into train and test sets, stratified by the original SFARI score
# so the score proportions are preserved in both sets.
#   p:    proportion of samples assigned to the training set
#   seed: RNG seed for a reproducible partition
# Returns a list with elements 'train_set' and 'test_set'.
# NOTE(review): reads the globals `dataset` and `original_dataset`
create_train_test_sets = function(p, seed){
# Get SFARI Score of all the samples so our train and test sets are balanced for each score
sample_scores = dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID) %>%
left_join(original_dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, gene.score),
by = 'ID') %>%
# Genes absent from original_dataset get the 'None' (non-SFARI) score
mutate(gene.score = ifelse(is.na(gene.score), 'None', gene.score))
set.seed(seed)
train_idx = createDataPartition(sample_scores$gene.score, p = p, list = FALSE)
train_set = dataset[train_idx,]
test_set = dataset[-train_idx,]
return(list('train_set' = train_set, 'test_set' = test_set))
}
# Train a ridge-regularised logistic regression (glmnet with alpha = 0) on one
# stratified train/test split and evaluate it on the test set.
#   p:    training-set proportion
#   seed: RNG seed (used for both the partition and the CV folds)
# Returns a list with accuracy, precision, recall, F1 and AUC on the test set,
# the per-gene predictions, and the coefficients at the best tuned lambda.
run_model = function(p, seed){
# Create train and test sets
train_test_sets = create_train_test_sets(p, seed)
train_set = train_test_sets[['train_set']]
test_set = train_test_sets[['test_set']]
# Train Model: recode the label to factor levels caret accepts
train_set = train_set %>% mutate(SFARI = ifelse(SFARI==TRUE, 'SFARI', 'not_SFARI') %>% as.factor)
# Grid of ridge penalties to tune over (10 down to 1e-4, log-spaced)
lambda_seq = 10^seq(1, -4, by = -.1)
set.seed(seed)
k_fold = 10
cv_repeats = 5
# SMOTE over-sampling is applied inside each CV fold to balance the classes
smote_over_sampling = trainControl(method = 'repeatedcv', number = k_fold, repeats = cv_repeats,
verboseIter = FALSE, classProbs = TRUE, savePredictions = 'final',
summaryFunction = twoClassSummary, sampling = 'smote')
fit = train(SFARI ~., data = train_set, method = 'glmnet', trControl = smote_over_sampling, metric = 'ROC',
tuneGrid = expand.grid(alpha = 0, lambda = lambda_seq))
# Predict labels in test set; a gene is called SFARI when P(SFARI) > 0.5
predictions = fit %>% predict(test_set, type = 'prob')
preds = data.frame('ID' = rownames(test_set), 'prob' = predictions$SFARI) %>% mutate(pred = prob>0.5)
# Measure performance of the model (logicals coerce to 0/1, positive class = '1')
acc = mean(test_set$SFARI==preds$pred)
prec = Precision(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
rec = Recall(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
F1 = F1_Score(test_set$SFARI %>% as.numeric, preds$pred %>% as.numeric, positive = '1')
pred_ROCR = prediction(preds$prob, test_set$SFARI)
AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]
# Extract coefficients from features at the selected lambda
coefs = coef(fit$finalModel, fit$bestTune$lambda) %>% as.vector
return(list('acc' = acc, 'prec' = prec, 'rec' = rec, 'F1' = F1, 'AUC' = AUC, 'preds' = preds, 'coefs' =coefs))
}
### RUN MODEL
# Parameters: 75% training split, 25 repetitions with consecutive seeds
p <- 0.75
n_iter <- 25
seeds <- 123:(123 + n_iter - 1)
# Accumulators for per-iteration performance metrics
acc <- c()
prec <- c()
rec <- c()
F1 <- c()
AUC <- c()
# Per-gene accumulators: summed probability / positive calls and how often each
# gene landed in a test set; coefficients are summed and averaged after the loop
predictions <- data.frame('ID' = rownames(dataset),
                          'SFARI' = dataset$SFARI,
                          'prob' = 0, 'pred' = 0, 'n' = 0)
coefs <- data.frame('var' = c('Intercept', colnames(dataset[, -ncol(dataset)])),
                    'coef' = 0)
# Run the model once per seed and aggregate metrics, predictions and coefficients
for(seed in seeds){
# Run model
model_output = run_model(p, seed)
# Update outputs: append this iteration's metrics
acc = c(acc, model_output[['acc']])
prec = c(prec, model_output[['prec']])
rec = c(rec, model_output[['rec']])
F1 = c(F1, model_output[['F1']])
AUC = c(AUC, model_output[['AUC']])
preds = model_output[['preds']]
# Coefficients are summed here and divided by n_iter after the loop
coefs$coef = coefs$coef + model_output[['coefs']]
# For genes in this iteration's test set: add prob and pred, and count the appearance in n
update_preds = preds %>% dplyr::select(-ID) %>% mutate(n=1)
predictions[predictions$ID %in% preds$ID, c('prob','pred','n')] = predictions[predictions$ID %in%
preds$ID, c('prob','pred','n')] +
update_preds
}
# Average the coefficients over all iterations; for each gene, average its
# test-set probabilities, keep the count of positive calls as pred_count, and
# re-derive the final label from the averaged probability
coefs$coef <- coefs$coef / n_iter
predictions <- predictions %>%
  mutate(prob = prob / n,
         pred_count = pred,
         pred = prob > 0.5)
rm(p, seeds, update_preds, create_train_test_sets, run_model)
To summarise in a single value the predictions of the models:
prob = Average of all the probabilities
pred = 1 if prob>0.5, 0 otherwise
# Keep only genes that appeared in at least one test set and attach GS and MTcor
test_set = predictions %>% filter(n>0) %>%
left_join(dataset %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, GS, MTcor), by = 'ID')
rownames(test_set) = predictions$ID[predictions$n>0]
# Confusion matrix of the aggregated predictions vs the actual SFARI labels
conf_mat = test_set %>% apply_labels(SFARI = 'Actual Labels',
prob = 'Assigned Probability',
pred = 'Label Prediction')
cro(conf_mat$SFARI, list(conf_mat$pred, total()))
| Label Prediction | #Total | |||
|---|---|---|---|---|
| FALSE | TRUE | |||
| Actual Labels | ||||
| FALSE | 10230 | 2702 | 12932 | |
| TRUE | 443 | 241 | 684 | |
| #Total cases | 10673 | 2943 | 13616 | |
rm(conf_mat)
# ROC and lift curves built from the aggregated (averaged) probabilities
pred_ROCR = prediction(test_set$prob, test_set$SFARI)
roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
auc = performance(pred_ROCR, measure='auc')@y.values[[1]]
# NOTE(review): the plot title uses mean(AUC) (mean of per-iteration AUCs), while
# `auc` above (AUC of the aggregated probabilities) is never used — confirm intent
plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(mean(AUC),2),')'), col='#009999')
abline(a=0, b=1, col='#666666')
lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')
rm(pred_ROCR, roc_ROCR, AUC, lift_ROCR)
MTcor has a very small coefficient
# For each gene: its module's MTcor and SFARI label, keyed by the module colour (hex without '#')
gene_corr_info = dataset %>% mutate('ID' = rownames(dataset)) %>% dplyr::select(ID, MTcor, SFARI) %>%
left_join(assigned_module, by ='ID') %>% mutate(Module = gsub('#','',Module))
# Map each MM.<module> coefficient to its module and compute the module's share of SFARI genes
coef_info = coefs %>% mutate('feature' = gsub('MM.','',var)) %>%
left_join(gene_corr_info, by = c('feature' = 'Module')) %>%
dplyr::select(feature, coef, MTcor, SFARI) %>% group_by(feature, coef, MTcor) %>%
summarise('SFARI_perc' = mean(SFARI)) %>% arrange(desc(coef))
# Show only the coefficients of the non-module features
coef_info %>% dplyr::select(feature, coef) %>% filter(feature %in% c('Intercept','GS','absGS','MTcor')) %>%
dplyr::rename('Feature' = feature, 'Coefficient' = coef) %>%
kable(align = 'cc',
caption = 'Regression Coefficients (filtering MM features)') %>%
kable_styling(full_width = F)
| Feature | Coefficient |
|---|---|
| absGS | 0.1383954 |
| GS | 0.0794006 |
| MTcor | -0.0209554 |
| Intercept | -0.4461271 |
There is a positive relation between the coefficient assigned to the membership of each module and the enrichment (using ORA) in SFARI genes that are assigned to that module
load('./../Data/ORA.RData')
# Per-module SFARI enrichment: 1 - (ORA p-value for the SFARI set), or 0 when the
# module has no SFARI entry in its ORA results (higher = more enriched)
enrichment_SFARI_info = data.frame('Module'=as.character(), 'SFARI_enrichment'=as.numeric())
for(m in names(enrichment_SFARI)){
m_info = enrichment_SFARI[[m]]
enrichment = 1-ifelse('SFARI' %in% m_info$ID, m_info$pvalue[m_info$ID=='SFARI'],1)
enrichment_SFARI_info = enrichment_SFARI_info %>%
add_row(Module = gsub('#','',m), SFARI_enrichment = enrichment)
}
# Scatter of module coefficient vs SFARI enrichment, each point in its module's own colour
plot_data = coef_info %>% dplyr::rename('Module' = feature) %>%
left_join(enrichment_SFARI_info, by = 'Module') %>% filter(!is.na(MTcor))
ggplotly(plot_data %>% ggplot(aes(coef, SFARI_enrichment)) +
geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) +
geom_point(aes(id = Module), color = paste0('#',plot_data$Module), alpha=0.7) +
theme_minimal() + xlab('Coefficient') +
ylab('SFARI Genes Enrichment'))
rm(enrichment_old_SFARI, enrichment_DGN, enrichment_DO, enrichment_GO, enrichment_KEGG, enrichment_Reactome,
m, m_info, enrichment)
There doesn’t seem to be a relation between the coefficient and the correlation of the module and the diagnosis.
This is not a surprise since we knew that there was a negative relation between SFARI genes and Module-Diagnosis correlation from Preprocessing/Gandal/AllRegions/RMarkdowns/20_04_03_WGCNA_modules_EA.html. The fact that there is no relation between coefficient and Module-Diagnosis correlation could even be a good sign that the model is picking some biological signal as well as the SFARI patterns (since the relation with the biological signals is positive)
# Module coefficient vs Module-Diagnosis correlation, points coloured by module colour
ggplotly(coef_info %>% dplyr::rename('Module' = feature) %>% filter(!is.na(MTcor)) %>%
ggplot(aes(coef, MTcor)) + geom_smooth(method = 'lm', color = 'gray', alpha = 0.1) +
geom_point(aes(id = Module), color = paste0('#',coef_info$feature[!is.na(coef_info$MTcor)]),
alpha = 0.7) +
theme_minimal() + xlab('Coefficient') +
ylab('Module-Diagnosis correlation'))
SFARI genes have a higher score distribution than the rest, but the overlap is large
# Model-score densities by SFARI label; dashed lines mark each group's mean probability
plot_data = test_set %>% dplyr::select(prob, SFARI)
ggplotly(plot_data %>% ggplot(aes(prob, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + xlab('Score') +
geom_vline(xintercept = mean(plot_data$prob[plot_data$SFARI]), color = '#00C0C2', linetype = 'dashed') +
geom_vline(xintercept = mean(plot_data$prob[!plot_data$SFARI]), color = '#FF7371', linetype = 'dashed') +
theme_minimal() + ggtitle('Model score distribution by SFARI Label'))
Even though we didn’t use the actual SFARI Scores to train the model, but instead we grouped them all together, there seems to be a statistically significant positive relation between the SFARI scores and the probability assigned by the model
# Recover the original SFARI scores (1-3) and split the remaining genes into
# Neuronal (GO 'neuron' annotation) and Others
plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, prob) %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
mutate(gene.score = ifelse(gene.score=='None', ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'),
gene.score)) %>%
dplyr::select(ID, prob, gene.score) %>% apply_labels(gene.score='SFARI Gene score')
cro(plot_data$gene.score)
| #Total | |
|---|---|
| SFARI Gene score | |
| 1 | 128 |
| 2 | 180 |
| 3 | 376 |
| Neuronal | 840 |
| Others | 12092 |
| #Total cases | 13616 |
# NOTE(review): mean_vals is computed but only removed below, never used
mean_vals = plot_data %>% group_by(gene.score) %>% summarise(mean_prob = mean(prob))
# Pairwise Welch t-tests between all score groups, stacked at increasing heights above the boxplots
comparisons = list(c('1','2'), c('2','3'), c('3','Neuronal'), c('Neuronal','Others'),
c('1','3'), c('3','Others'), c('2','Neuronal'),
c('1','Neuronal'), c('2','Others'), c('1','Others'))
increase = 0.08
base = 0.9
pos_y_comparisons = c(rep(base, 4), rep(base + increase, 2), base + 2:5*increase)
plot_data %>% ggplot(aes(gene.score, prob, fill=gene.score)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test',
method.args = list(var.equal = FALSE), label.y = pos_y_comparisons,
tip.length = .02) +
# SFARI palette positions 1-3 for the scores, plus the grays for Neuronal/Others
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) +
ggtitle('Distribution of probabilities by SFARI score') +
xlab('SFARI score') + ylab('Probability') + theme_minimal() + theme(legend.position = 'none')
rm(mean_vals, increase, base, pos_y_comparisons)
Considering the class imbalance in the test set (1:19), there are many more SFARI scores in here (1:10)
High concentration of genes with a SFARI Score of 1
# Top 50 genes by averaged model probability in the test set, annotated with gene
# symbol, module colour, GS, MTcor and original SFARI score
test_set %>% dplyr::select(prob, SFARI) %>% mutate(ID = rownames(test_set)) %>%
arrange(desc(prob)) %>% top_n(50, wt=prob) %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
mutate(gene.score = ifelse(gene.score=='None', ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others'),
gene.score)) %>%
left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = prob, 'ModuleDiagnosis_corr' =MTcor,
'GeneSignificance' = GS) %>%
mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr, 4), Probability = round(Probability, 4),
GeneSignificance = round(GeneSignificance, 4)) %>%
dplyr::select(GeneSymbol, GeneSignificance, ModuleDiagnosis_corr, Module, Probability,
gene.score) %>%
kable(caption = 'Genes with highest model probabilities from the test set') %>%
kable_styling(full_width = F)
| GeneSymbol | GeneSignificance | ModuleDiagnosis_corr | Module | Probability | gene.score |
|---|---|---|---|---|---|
| ANAPC1 | 0.0710 | 0.2018 | #3EA1FF | 0.7958 | Others |
| SP4 | -0.0995 | 0.2018 | #3EA1FF | 0.7919 | Others |
| ZNF649 | 0.1808 | 0.6113 | #A789FF | 0.7905 | Others |
| BRPF3 | -0.2706 | 0.2208 | #00B92D | 0.7901 | Others |
| SPIN4 | 0.4077 | 0.6113 | #A789FF | 0.7726 | Others |
| ROBO1 | -0.0273 | 0.6113 | #A789FF | 0.7719 | Neuronal |
| SGCB | 0.4277 | 0.6113 | #A789FF | 0.7647 | Others |
| MYO10 | -0.1091 | 0.1508 | #00A4FF | 0.7573 | Neuronal |
| NAGS | 0.1168 | -0.2138 | #FF6C91 | 0.7546 | Others |
| PCDHB10 | 0.2804 | 0.6113 | #A789FF | 0.7453 | Others |
| RIF1 | 0.1954 | 0.6113 | #A789FF | 0.7426 | Others |
| GABRG1 | 0.1293 | 0.1508 | #00A4FF | 0.7419 | Others |
| USP25 | 0.2559 | 0.6113 | #A789FF | 0.7418 | Others |
| PDGFRA | 0.4996 | 0.6113 | #A789FF | 0.7402 | Others |
| KCTD3 | 0.1536 | 0.0832 | #00BD61 | 0.7391 | Others |
| SPOPL | -0.1516 | -0.2138 | #FF6C91 | 0.7380 | Others |
| NLGN1 | 0.3103 | 0.6113 | #A789FF | 0.7375 | 2 |
| MED1 | -0.1427 | -0.2138 | #FF6C91 | 0.7358 | Neuronal |
| RIN2 | 0.2519 | 0.1508 | #00A4FF | 0.7356 | Others |
| C3orf58 | 0.2204 | 0.6113 | #A789FF | 0.7346 | 3 |
| SPAST | 0.0746 | -0.2138 | #FF6C91 | 0.7309 | 1 |
| SOGA1 | 0.1969 | 0.6143 | #00AAFD | 0.7307 | Others |
| NETO2 | 0.1194 | 0.2018 | #3EA1FF | 0.7302 | Others |
| ELMO2 | 0.1032 | 0.6113 | #A789FF | 0.7287 | Others |
| CWC22 | -0.0095 | 0.2073 | #D575FE | 0.7279 | Others |
| HIP1 | 0.2566 | 0.2407 | #FF61C6 | 0.7270 | Others |
| BACH2 | 0.0527 | 0.2208 | #00B92D | 0.7269 | Others |
| MKL2 | 0.0595 | 0.6113 | #A789FF | 0.7268 | 3 |
| YES1 | 0.0980 | 0.1508 | #00A4FF | 0.7267 | Others |
| NUDCD1 | 0.4902 | 0.6113 | #A789FF | 0.7253 | Others |
| SASS6 | -0.0247 | -0.2138 | #FF6C91 | 0.7240 | Others |
| RNF144A | 0.0537 | 0.2208 | #00B92D | 0.7238 | Others |
| LIN7C | 0.1267 | -0.2138 | #FF6C91 | 0.7228 | Neuronal |
| SQLE | 0.1144 | 0.2190 | #FC7180 | 0.7209 | Others |
| CDK19 | 0.3077 | 0.2018 | #3EA1FF | 0.7205 | Others |
| FLRT1 | -0.1712 | 0.0400 | #B0A100 | 0.7204 | Others |
| NUP214 | -0.2512 | 0.0400 | #B0A100 | 0.7198 | Others |
| BCL9L | -0.1023 | 0.4586 | #D19300 | 0.7195 | Others |
| NRXN3 | -0.1909 | -0.1972 | #CC9600 | 0.7193 | 1 |
| ZNF772 | 0.3303 | 0.1508 | #00A4FF | 0.7190 | Others |
| TJP1 | 0.1871 | 0.2208 | #00B92D | 0.7180 | Others |
| MBLAC2 | 0.1260 | 0.2018 | #3EA1FF | 0.7177 | Others |
| SIRT1 | -0.0796 | 0.2079 | #00BDD4 | 0.7162 | Others |
| SSH2 | 0.2467 | 0.4406 | #00BB4B | 0.7155 | Others |
| ZNF235 | 0.1638 | 0.2208 | #00B92D | 0.7141 | Others |
| HOMER2 | -0.1697 | -0.3275 | #FA7477 | 0.7138 | Neuronal |
| MAPK1IP1L | -0.1771 | 0.0832 | #00BD61 | 0.7136 | Others |
| FAM135A | 0.0284 | 0.2018 | #3EA1FF | 0.7126 | Others |
| CWF19L2 | -0.0711 | 0.2079 | #00BDD4 | 0.7102 | Others |
| OXSR1 | -0.2093 | -0.3275 | #FA7477 | 0.7075 | Others |
The objective of this model is to identify candidate SFARI genes. For this, we are going to focus on the negative samples (the non-SFARI genes)
# Restrict the test set to the negative samples (the non-SFARI genes)
negative_set = filter(test_set, !SFARI)

# Attach human-readable labels and cross-tabulate the predicted labels
negative_set_table = apply_labels(negative_set,
                                  prob = 'Assigned Probability',
                                  pred = 'Label Prediction')
cro(negative_set_table$pred)
| #Total | |
|---|---|
| Label Prediction | |
| FALSE | 10230 |
| TRUE | 2702 |
| #Total cases | 12932 |
2702 genes are predicted as ASD-related
# Density of the model probabilities over the negative samples, with the 0.5
# classification threshold marked as a dotted line
ggplot(negative_set, aes(prob)) +
  geom_density(color='#F8766D', fill='#F8766D', alpha=0.5) +
  geom_vline(xintercept=0.5, color='#333333', linetype='dotted') +
  xlab('Probability') +
  ggtitle('Probability distribution of the Negative samples in the Test Set') +
  theme_minimal()
# Probability vs Gene Significance, coloured by Module-Diagnosis correlation,
# with a loess trend line and a horizontal reference at GS = 0
ggplot(negative_set, aes(prob, GS, color = MTcor)) +
  geom_point() +
  geom_smooth(method = 'loess', color = '#666666') +
  geom_hline(yintercept = 0, color='gray', linetype='dashed') +
  xlab('Probability') +
  ylab('Gene Significance') +
  scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) +
  ggtitle('Relation between Probability and Gene Significance') +
  theme_minimal()
There’s not a strong relation between the Module-Diagnosis correlation of the genes’ assigned module and the probability assigned by the model
The model seems to assign slightly higher probabilities to genes belonging to modules with positive Module-Diagnosis correlations than to modules with negative Module-Diagnosis correlations
# Module-Diagnosis correlation vs model probability, coloured by Gene
# Significance; the dashed line marks the mean probability of the negative set
ggplot(negative_set, aes(MTcor, prob, color=GS)) +
  geom_point() +
  geom_smooth(method='loess', color='#666666') +
  geom_hline(yintercept=mean(negative_set$prob), color='gray', linetype='dashed') +
  scale_color_gradientn(colours=c('#F8766D','#F8766D','white','#00BFC4','#00BFC4')) +
  xlab('Modules ordered by their correlation to ASD') +
  ylab('Model probability') +
  theme_minimal()
The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same
Again, the model seems to give lower probabilities to genes belonging to modules with a high negative correlation to Diagnosis than to the rest
# Summarise the negative set at module level (one row per module): mean and sd
# of the model probability plus module size, keyed by the module's correlation
# to Diagnosis (MTcor). The join back to original_dataset recovers the Module
# colour; distinct() collapses the fan-out caused by joining on MTcor.
plot_data = negative_set %>% group_by(MTcor) %>% summarise(mean = mean(prob), sd = sd(prob), n = n()) %>%
mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>%
left_join(original_dataset, by='MTcor') %>%
dplyr::select(Module, MTcor, MTcor_sign, mean, sd, n) %>% distinct()
# Positional rename: first selected column is Module (the module colour),
# exposed as 'ID' so ggplotly's tooltip shows it
colnames(plot_data)[1] = 'ID'
# Interactive scatter: one point per module, sized by module size, with both a
# loess and a linear trend of mean probability vs Module-Diagnosis correlation
ggplotly(plot_data %>% ggplot(aes(MTcor, mean, size=n, color=MTcor_sign)) + geom_point(aes(id=ID), alpha=0.7) +
geom_smooth(method='loess', color='gray', se=FALSE) + geom_smooth(method='lm', se=FALSE) +
xlab('Module-Diagnosis correlation') + ylab('Mean Probability by Module') + theme_minimal())
# Gupta dataset: load the preprocessed expression data (provides datExpr,
# datGenes, datMeta and DE_info) and key the expression matrix by Ensembl ID
load('./../Data/preprocessed_data.RData')

datExpr = data.frame(datExpr)
rownames(datExpr) = datGenes$ensembl_gene_id
DE_info = data.frame(DE_info)
The relation between mean level of expression of the genes and the probability assigned by the model seems to be negative!
# Per-gene mean and standard deviation of expression, keyed by Ensembl ID
mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))
# Attach expression summaries and module assignment to the negative set
plot_data = negative_set %>% left_join(mean_and_sd, by='ID') %>%
left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>%
dplyr::select(ID, Module), by='ID')
# Positional rename: the Module column added by the last join is the final
# column; exposed simply as 'Module' (handles any join-suffix like Module.y)
colnames(plot_data)[ncol(plot_data)] = 'Module'
# Mean expression vs model probability, with loess (gray) and linear (dashed)
# trend lines
plot_data %>% ggplot(aes(meanExpr, prob)) + geom_point(alpha=0.2, color='#0099cc') +
geom_smooth(method='loess', color='gray', alpha=0.3) +
geom_smooth(method='lm', color='#808080', se=FALSE, linetype = 'dashed') +
xlab('Mean Expression') + ylab('Probability') +
ggtitle('Mean expression vs model probability by gene') +
theme_minimal()
rm(mean_and_sd)
Grouping the genes by module we see the same negative correlation!
# Aggregate to module level: average expression and average model probability
# per module, plus module size
plot_data2 = plot_data %>%
  group_by(Module) %>%
  summarise(meanExpr = mean(meanExpr), meanProb = mean(prob), n=n())

# Interactive module-level scatter; points are drawn in each module's own
# colour (the Module labels are hex colours)
ggplotly(
  ggplot(plot_data2, aes(meanExpr, meanProb, size=n)) +
    geom_point(color=plot_data2$Module, alpha = 0.6) +
    geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) +
    geom_smooth(method='lm', color='#808080', se=FALSE, linetype = 'dashed') +
    xlab('Mean Level of Expression') +
    ylab('Average Model Probability') +
    theme_minimal() +
    theme(legend.position='none') +
    ggtitle('Mean expression vs model probability by Module')
)

rm(plot_data2)
I’m not sure if there is a relation between LFC and probability, perhaps there is for genes with a positive LFC, but for negative genes, the relation seems like it could be negative
# Attach the differential expression results (keyed by Ensembl ID) to the
# negative set, exposing logFC / adj.P.Val under the standard DESeq-style names
plot_data = negative_set %>% left_join(DE_info %>% mutate(ID=datGenes$ensembl_gene_id), by='ID') %>%
            mutate(log2FoldChange = logFC, padj = adj.P.Val)

# LFC vs model probability with a loess trend line.
# Fix: the original specified xlab('LFC') twice in the same chain; the
# duplicate layer was redundant (the second call simply overwrote the first)
plot_data %>% ggplot(aes(log2FoldChange, prob)) + geom_point(alpha=0.1, color='#0099cc') +
              geom_smooth(method='loess', color='gray', alpha=0.3) +
              xlab('LFC') + ylab('Probability') +
              theme_minimal() + ggtitle('LFC vs model probability by gene')
Differentially expressed genes have a lower probability than non-differentially expressed genes
# Side-by-side LFC vs probability plots, split at LFC = 0 and coloured by
# differential expression status (adjusted p < 0.05); both panels share the
# same y range so they read as one continuous plot
p1 = plot_data %>% filter(log2FoldChange<0) %>% mutate(DE = padj<0.05) %>%
     ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.2) +
     geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Probability') +
     ylim(c(min(plot_data$prob), max(plot_data$prob))) +
     theme_minimal() + theme(legend.position = 'none', plot.margin=unit(c(1,-0.3,1,1), 'cm'))

# Right panel: y axis moved to the right and left blank.
# Fix: the original set ylab('Probability') and then immediately overrode it
# with ylab(''); the dead first call has been removed
p2 = plot_data %>% filter(log2FoldChange>=0) %>% mutate(DE = padj<0.05) %>%
     ggplot(aes(log2FoldChange, prob, color=DE)) + geom_point(alpha=0.2) +
     geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('') +
     scale_y_continuous(position = 'right', limits = c(min(plot_data$prob), max(plot_data$prob))) +
     theme_minimal() + theme(plot.margin = unit(c(1,1,1,-0.3), 'cm'), axis.ticks.y = element_blank())

grid.arrange(p1, p2, nrow=1, top = 'LFC vs model probability by gene', bottom = 'LFC')

rm(p1, p2)
The results from this model are very different from the ones obtained in the other three datasets:
There is a bias in probability assigned by the model and level of expression, but it is a negative one: the higher the level of expression of a gene, the lower the probability it is assigned by the model
There doesn’t seem to be a noticeable relation between LFC and probability assigned by the model
Still, the model seems to be able to identify SFARI Genes, as we can see from the lift curve
# Attach gene symbols to the test-set predictions and persist everything
# needed to reuse the model (predictions, dataset, fitted model)
predictions = left_join(test_set, gene_names, by = c('ID' = 'ensembl_gene_id'))
save(predictions, dataset, fit, file='./../Data/Ridge_model.RData')
# Record R version and package versions for reproducibility
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] DMwR_0.4.1 doParallel_1.0.15 iterators_1.0.12 foreach_1.5.0
## [5] kableExtra_1.1.0 expss_0.10.2 corrplot_0.84 MLmetrics_1.1.1
## [9] car_3.0-7 carData_3.0-3 ROCR_1.0-7 gplots_3.0.3
## [13] caret_6.0-86 lattice_0.20-41 polycor_0.7-10 biomaRt_2.40.5
## [17] ggpubr_0.2.5 magrittr_1.5 RColorBrewer_1.1-2 gridExtra_2.3
## [21] viridis_0.5.1 viridisLite_0.3.0 plotly_4.9.2 knitr_1.28
## [25] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.0 purrr_0.3.4
## [29] readr_1.3.1 tidyr_1.1.0 tibble_3.0.1 ggplot2_3.3.2
## [33] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.8 plyr_1.8.6
## [4] lazyeval_0.2.2 splines_3.6.3 crosstalk_1.1.0.1
## [7] digest_0.6.25 htmltools_0.4.0 gdata_2.18.0
## [10] fansi_0.4.1 checkmate_2.0.0 memoise_1.1.0
## [13] openxlsx_4.1.4 recipes_0.1.10 modelr_0.1.6
## [16] gower_0.2.1 matrixStats_0.56.0 xts_0.12-0
## [19] prettyunits_1.1.1 colorspace_1.4-1 blob_1.2.1
## [22] rvest_0.3.5 haven_2.2.0 xfun_0.12
## [25] crayon_1.3.4 RCurl_1.98-1.2 jsonlite_1.7.0
## [28] zoo_1.8-8 survival_3.1-12 glue_1.4.1
## [31] gtable_0.3.0 ipred_0.9-9 webshot_0.5.2
## [34] shape_1.4.4 quantmod_0.4.17 BiocGenerics_0.30.0
## [37] abind_1.4-5 scales_1.1.1 DBI_1.1.0
## [40] miniUI_0.1.1.1 Rcpp_1.0.4.6 xtable_1.8-4
## [43] progress_1.2.2 htmlTable_1.13.3 foreign_0.8-76
## [46] bit_1.1-15.2 stats4_3.6.3 lava_1.6.7
## [49] prodlim_2019.11.13 glmnet_3.0-2 htmlwidgets_1.5.1
## [52] httr_1.4.1 ellipsis_0.3.1 pkgconfig_2.0.3
## [55] XML_3.99-0.3 farver_2.0.3 nnet_7.3-14
## [58] dbplyr_1.4.2 later_1.0.0 tidyselect_1.1.0
## [61] labeling_0.3 rlang_0.4.6 reshape2_1.4.4
## [64] AnnotationDbi_1.46.1 munsell_0.5.0 cellranger_1.1.0
## [67] tools_3.6.3 cli_2.0.2 generics_0.0.2
## [70] RSQLite_2.2.0 broom_0.5.5 fastmap_1.0.1
## [73] evaluate_0.14 yaml_2.2.1 ModelMetrics_1.2.2.2
## [76] bit64_0.9-7 fs_1.4.0 zip_2.0.4
## [79] caTools_1.18.0 nlme_3.1-147 mime_0.9
## [82] ggExtra_0.9 xml2_1.2.5 compiler_3.6.3
## [85] rstudioapi_0.11 curl_4.3 ggsignif_0.6.0
## [88] reprex_0.3.0 stringi_1.4.6 highr_0.8
## [91] Matrix_1.2-18 vctrs_0.3.1 pillar_1.4.4
## [94] lifecycle_0.2.0 data.table_1.12.8 bitops_1.0-6
## [97] httpuv_1.5.2 R6_2.4.1 promises_1.1.0
## [100] KernSmooth_2.23-17 rio_0.5.16 IRanges_2.18.3
## [103] codetools_0.2-16 MASS_7.3-51.6 gtools_3.8.2
## [106] assertthat_0.2.1 withr_2.2.0 S4Vectors_0.22.1
## [109] mgcv_1.8-31 hms_0.5.3 rpart_4.1-15
## [112] timeDate_3043.102 class_7.3-17 rmarkdown_2.1
## [115] TTR_0.23-6 pROC_1.16.2 shiny_1.4.0.2
## [118] Biobase_2.44.0 lubridate_1.7.4